1 Load necessary libraries

# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install(c("GEOquery", "edgeR", "limma", "ggplot2", "reshape2"))

library(GEOquery)
library(edgeR)
library(limma)
library(ggplot2)
library(reshape2)
library(ggrepel)
library(biomaRt)
library(DT)
library(pheatmap)

# # Download supplementary files for GSE203206 [https://molecularbrain.biomedcentral.com/articles/10.1186/s13041-022-00963-2#MOESM2] if the directory doesn't exist.
# if (!dir.exists("GSE203206")) {
#     getGEOSuppFiles("GSE203206", makeDirectory = TRUE, baseDir = ".")
# }
# 
# url <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE203nnn/GSE203206/suppl/GSE203206_Subramaniam.ADRC_brain.counts.tsv.gz"
# destfile <- "./GSE203206/GSE203206_Subramaniam.ADRC_brain.counts.tsv.gz"
# dir.create("./GSE203206", showWarnings = FALSE)
# download.file(url, destfile, mode = "wb")

# Download the GEO dataset
getGEOSuppFiles("GSE203206", makeDirectory = TRUE)
# Unzip the metadata if needed
# untar("GSE203206/GSE203206_RAW.tar", exdir = "GSE203206/")

2 Part I: Differential Expression Analysis Using limma-voom

2.1 Step 1: Read data

counts <- read.table("./GSE203206/GSE203206_Subramaniam.ADRC_brain.counts.tsv.gz",
                     header = TRUE, sep = "\t", row.names = 1)

metadata <- read.table("./GSE203206/GSE203206_Subramaniam.ADRC_brain.metadata.tsv.gz",
                       header = TRUE, sep = "\t", row.names = 1)

# Define ALZHEIMER_TYPE based on 'Sample' prefix and filter only Early and Control
metadata$ALZHEIMER_TYPE <- NA
metadata$ALZHEIMER_TYPE[grepl("^EOL", metadata$Sample)] <- "Early"
metadata$ALZHEIMER_TYPE[grepl("^COL", metadata$Sample)] <- "Control"
metadata$ALZHEIMER_TYPE[grepl("^LOL", metadata$Sample)] <- "Late"

metadata <- metadata[!is.na(metadata$ALZHEIMER_TYPE), ]
metadata$ALZHEIMER_TYPE <- factor(metadata$ALZHEIMER_TYPE, levels = c("Control", "Early", "Late"))
## Counts matrix (first few rows):
##                 E_4920_OL_RNA_S1 E_5273_OL_RNA_S75 E_5618_OL_RNA_S62
## ENSG00000000003         51.68859          98.80348          47.63177
## ENSG00000000419         13.05284          28.37234          27.76082
## ENSG00000000457        185.46142         152.97943         113.97631
## ENSG00000000460         53.91895          51.50589          47.99208
## ENSG00000000938         44.99193          32.42752          57.72998
## ENSG00000000971        153.93871         110.24908         123.04170
##                 L_5342_OL_RNA_S95 L_5425_OL_RNA_S39 L_5492_OL_RNA_S78
## ENSG00000000003         43.932208          13.21789          24.11123
## ENSG00000000419         27.508268          12.08312          17.15611
## ENSG00000000457        112.441081          46.99159         103.12127
## ENSG00000000460         45.338682          37.11980          30.61393
## ENSG00000000938          7.129156          38.70423          25.22177
## ENSG00000000971         33.344420          52.37255          43.59417
##                 L_5566_OL_RNA_S92 L_5569_OL_RNA_S9 L_5594_OL_RNA_S65
## ENSG00000000003          42.24082         68.04576          69.49676
## ENSG00000000419           7.50626          0.00000           0.00000
## ENSG00000000457          60.79745        141.47573          92.16050
## ENSG00000000460          46.27109         53.75986          54.60007
## ENSG00000000938          29.92210         39.33452          53.73396
## ENSG00000000971          60.26747        182.64778         490.23938
##                 L_5017_OL_RNA_S87 L_5147_OL_RNA_S22 L_5197_OL_RNA_S20
## ENSG00000000003         42.984646          69.34930          33.73442
## ENSG00000000419          3.910817           6.56959          10.82245
## ENSG00000000457         80.805508          78.06097         103.28810
## ENSG00000000460         39.482685          56.00870          42.18951
## ENSG00000000938         46.092878         113.67277          49.17866
## ENSG00000000971         64.029192         517.03545         165.37552
##                 L_5252_OL_RNA_S19 L_5275_OL_RNA_S57 E_4934_OL_RNA_S40
## ENSG00000000003          32.38435          89.01499         18.057221
## ENSG00000000419          12.32396          72.18759          5.649749
## ENSG00000000457          77.45872         126.65909         22.995939
## ENSG00000000460          42.04894          80.52346         16.513741
## ENSG00000000938          33.92262          59.54354         16.998740
## ENSG00000000971          57.24617         204.16033         57.242921
##                 E_5047_OL_RNA_S43 E_5082_OL_RNA_S80 E_5107_OL_RNA_S98
## ENSG00000000003          5.983025         29.940971       25.84285907
## ENSG00000000419          0.000000          1.598278        0.01694824
## ENSG00000000457          6.046037         21.482075        9.53858873
## ENSG00000000460          1.496420         11.022296        4.65642648
## ENSG00000000938         31.442038         54.809737       46.04421144
## ENSG00000000971         28.095790         36.655915       15.86122752
##                 E_5333_OL_RNA_S25 E_5337_OL_RNA_S12 E_5419_OL_RNA_S14
## ENSG00000000003          37.83331         24.491256         25.455549
## ENSG00000000419           2.87569          1.186218          0.000000
## ENSG00000000457          42.46450         29.593666          7.574574
## ENSG00000000460          75.04347         22.767128          4.998618
## ENSG00000000938          80.28758         64.598344         15.084744
## ENSG00000000971          65.91267        108.837765         17.410583
##                 E_5454_OL_RNA_S21 E_5537_OL_RNA_S76 E_5585_OL_RNA_S6
## ENSG00000000003          30.94204         37.875627        57.640603
## ENSG00000000419          10.98872          9.031095         5.696223
## ENSG00000000457          72.37977         58.859506        35.689411
## ENSG00000000460          42.69146         26.252744        27.578701
## ENSG00000000938          94.16010         84.239635        83.229502
## ENSG00000000971         120.31361         97.306828       207.095468
##                 E_5020_OL_RNA_S79 E_4988_OL_RNA_S16 E_5113_OL_RNA_S23
## ENSG00000000003          36.82717          74.14136         45.952496
## ENSG00000000419          15.58039          11.53970          6.029349
## ENSG00000000457          46.25728          54.53149         72.955328
## ENSG00000000460          23.18493          26.59712         58.213898
## ENSG00000000938         109.96816         119.96690         95.993383
## ENSG00000000971          56.91853         219.50860        358.232096
##                 L_4950_OL_RNA_S64 L_5303_OL_RNA_S24 L_5366_OL_RNA_S5
## ENSG00000000003         20.566745         15.920322         19.41930
## ENSG00000000419          2.775672          8.057642          6.89522
## ENSG00000000457         57.421866         25.931982         31.65121
## ENSG00000000460         18.465445         13.176996         20.82778
## ENSG00000000938         36.424041         66.121533         41.98890
## ENSG00000000971        104.159476         62.262032         71.94089
##                 L_5376_OL_RNA_S61 L_5526_OL_RNA_S100 L_4964_OL_RNA_S32
## ENSG00000000003         14.937899          49.279873      2.567742e+01
## ENSG00000000419          3.322799           4.965268      9.091280e-08
## ENSG00000000457         21.781072          60.955968      4.192650e+01
## ENSG00000000460         11.784071          28.554476      3.467784e+01
## ENSG00000000938         45.589496          93.679074      9.661300e+01
## ENSG00000000971         41.879787         240.504291      7.947590e+01
##                 L_5065_OL_RNA_S37 L_5269_OL_RNA_S30 E_5184_OL_RNA_S46
## ENSG00000000003          85.24805         108.00403         41.046963
## ENSG00000000419          21.02972           0.00000          6.910645
## ENSG00000000457          61.41075          46.76018         48.171061
## ENSG00000000460          55.30758          35.88983         26.526717
## ENSG00000000938         122.93885         100.62998        133.947474
## ENSG00000000971         179.68781         270.17228        214.158209
##                 E_5240_OL_RNA_S73 E_5000_OL_RNA_S58 L_5394_OL_RNA_S4
## ENSG00000000003          75.89399          60.12211        54.094489
## ENSG00000000419           0.00000           4.95507         3.224707
## ENSG00000000457          44.71548          38.46552        19.841089
## ENSG00000000460          10.66665          38.88151        18.898575
## ENSG00000000938          98.34171         125.86505       148.393369
## ENSG00000000971         280.35927         801.86969       499.991450
##                 C_5628_OL_RNA_S44 C_5114_OL_RNA_S69 C_4942_OL_RNA_S90
## ENSG00000000003          23.76476          41.07581          28.04265
## ENSG00000000419          10.19532          11.45113          19.52941
## ENSG00000000457          94.32358         113.85271          71.97302
## ENSG00000000460          38.22775          40.30357          23.39992
## ENSG00000000938          17.07324          52.20809          25.28685
## ENSG00000000971          71.01627         188.12193          33.75536
##                 C_4954_OL_RNA_S28 C_5709_OL_RNA_S18 C_5070_OL_RNA_S60
## ENSG00000000003          52.16995          64.97861         62.994313
## ENSG00000000419          21.18185          25.99144          1.628206
## ENSG00000000457          74.81350         110.67929        104.467279
## ENSG00000000460          71.26355          49.29619         42.416638
## ENSG00000000938          86.49208          39.75260         99.771111
## ENSG00000000971         301.15890         494.08391         68.041632
##                 C_4870_OL_RNA_S68 C_5248_OL_RNA_S82
## ENSG00000000003        103.671885         69.463193
## ENSG00000000419          4.207163          5.521627
## ENSG00000000457        151.904752        174.692519
## ENSG00000000460         67.153599         55.376823
## ENSG00000000938         52.099429         36.343606
## ENSG00000000971         95.683895        207.515722
## 
## Metadata (first few rows):
##                   PATHID PMHOURS     PATHDX SEX AGE onsetAge APOE BRAAK1
## E_4920_OL_RNA_S1    4920      NA Alzheimers   M  67       60   34      6
## E_5273_OL_RNA_S75   5273      10 Alzheimers   M  41       35   33      6
## L_5342_OL_RNA_S95   5342      10 Alzheimers   F  83       76   33      6
## L_5425_OL_RNA_S39   5425       6 Alzheimers   F  80       73   34      6
## L_5492_OL_RNA_S78   5492       6 Alzheimers   M  79       72   33      6
## L_5566_OL_RNA_S92   5566       6 Alzheimers   M  84       72   33      6
##                   BRAAK.combo MFTANGLES IPTANGLES STTANGLES HPTANGLES MFPLAQUES
## E_4920_OL_RNA_S1          6.3         7         6        12        10        50
## E_5273_OL_RNA_S75         6.2        19        19         7        22        43
## L_5342_OL_RNA_S95         6.2         4         4         6        17        24
## L_5425_OL_RNA_S39         6.2         5         8         8        17        50
## L_5492_OL_RNA_S78         6.2         5         4         4        18        48
## L_5566_OL_RNA_S92         6.2         4         5        26        26        41
##                   IPPLAQUES STPLAQUES HPPLAQUES SUM_TANGLES EDUCATION BLESSED
## E_4920_OL_RNA_S1         50        50        25          35        14      33
## E_5273_OL_RNA_S75        50        44        46          67        16       6
## L_5342_OL_RNA_S95        21        35         8          31        20      16
## L_5425_OL_RNA_S39        50        34        16          38        16      30
## L_5492_OL_RNA_S78        43        35        13          31        12      16
## L_5566_OL_RNA_S92        43        -4         9          61        12      21
##                   DURATION MMSE DRS Sample MappingRate RIN ALZHEIMER_TYPE
## E_4920_OL_RNA_S1         8    0  60   EOL1       0.406 2.9          Early
## E_5273_OL_RNA_S75        6   NA  81  EOL12       0.408 3.6          Early
## L_5342_OL_RNA_S95        5   18  88  LOL11       0.395 2.1           Late
## L_5425_OL_RNA_S39        8    0  58  LOL15       0.325 2.5           Late
## L_5492_OL_RNA_S78        7   21 114  LOL16       0.333 3.3           Late
## L_5566_OL_RNA_S92       12   16  91  LOL19       0.370 3.1           Late
cat("\nCounts per Sample in PATHDX group (Healthy vs Alzheimer):\n")
## 
## Counts per Sample in PATHDX group (Healthy vs Alzheimer):
print(table(metadata$PATHDX))
## 
## Alzheimers    Healthy 
##         39          8
# Se a variável de early vs late estiver presente, por exemplo `ALZHEIMER_TYPE`
# Adicione essa análise também:
if ("Sample" %in% colnames(metadata)) {
  cat("\nSample Count per Alzheimer onset (Healthy, Early, Late):\n")
  print(table(metadata$ALZHEIMER_TYPE))
} else {
  cat("\nColumn 'ALzheimer Type' not  found in metadata.\n")
}
## 
## Sample Count per Alzheimer onset (Healthy, Early, Late):
## 
## Control   Early    Late 
##       8      19      20
ggplot(metadata, aes(x = ALZHEIMER_TYPE, fill = ALZHEIMER_TYPE)) +
  geom_bar() +
  theme_minimal() +
  labs(title = "Distribuition of samples per group", x = "Group", y = "No of samples") +
  theme(legend.position = "none")

2.2 Step 2: Match metadata and counts

metadata <- metadata[colnames(counts), ]

2.3 Step 3: Create DGEList

dge <- DGEList(counts = counts)

2.4 Step 4: Normalize counts

dge <- calcNormFactors(dge)

2.5 Step 5: Design matrix

# New matrix without interception
design <- model.matrix(~ 0 + ALZHEIMER_TYPE, data = metadata)
colnames(design) <- levels(metadata$ALZHEIMER_TYPE)

cat("\nDesign matrix for comparisson between Control, Early, Late:\n")
## 
## Design matrix for comparisson between Control, Early, Late:
print(head(design))
##                   Control Early Late
## E_4920_OL_RNA_S1        0     1    0
## E_5273_OL_RNA_S75       0     1    0
## E_5618_OL_RNA_S62       0     1    0
## L_5342_OL_RNA_S95       0     0    1
## L_5425_OL_RNA_S39       0     0    1
## L_5492_OL_RNA_S78       0     0    1
# Define contrasts for desired comparisons
contrast.matrix <- makeContrasts(
  Early_vs_Control = Early - Control,
  Late_vs_Control = Late - Control,
  Early_vs_Late = Early - Late,
  levels = design
)
## 
## Design matrix (first few rows):
##                   Control Early Late
## E_4920_OL_RNA_S1        0     1    0
## E_5273_OL_RNA_S75       0     1    0
## E_5618_OL_RNA_S62       0     1    0
## L_5342_OL_RNA_S95       0     0    1
## L_5425_OL_RNA_S39       0     0    1
## L_5492_OL_RNA_S78       0     0    1

2.6 Step 6: Voom transformation

# Aply voom
v <- voom(dge, design, plot = TRUE)

# Fitting model and apply contrast<a
fit <- lmFit(v, design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)

# Extrair resultados de cada contraste
res_early_vs_ctrl <- topTable(fit2, coef = "Early_vs_Control", number = Inf)
res_late_vs_ctrl  <- topTable(fit2, coef = "Late_vs_Control", number = Inf)
res_early_vs_late <- topTable(fit2, coef = "Early_vs_Late", number = Inf)

# Adicionar IDs
res_early_vs_ctrl$gene_id <- rownames(res_early_vs_ctrl)
res_late_vs_ctrl$gene_id  <- rownames(res_late_vs_ctrl)
res_early_vs_late$gene_id <- rownames(res_early_vs_late)

# Salvar resultados
write.csv(res_early_vs_ctrl, "DGE_Early_vs_Control.csv", row.names = FALSE)
write.csv(res_late_vs_ctrl,  "DGE_Late_vs_Control.csv", row.names = FALSE)
write.csv(res_early_vs_late, "DGE_Early_vs_Late.csv", row.names = FALSE)

# Contar genes significativos (ajuste FDR < 0.05 e |logFC| > 1)
cat("\nSummary of differentially expressed genes:\n")
## 
## Summary of differentially expressed genes:
cat("Early vs Control:", sum(res_early_vs_ctrl$adj.P.Val < 0.05 & abs(res_early_vs_ctrl$logFC) > 1), "genes\n")
## Early vs Control: 400 genes
cat("Late vs Control:",  sum(res_late_vs_ctrl$adj.P.Val  < 0.05 & abs(res_late_vs_ctrl$logFC)  > 1), "genes\n")
## Late vs Control: 0 genes
cat("Early vs Late:",    sum(res_early_vs_late$adj.P.Val < 0.05 & abs(res_early_vs_late$logFC) > 1), "genes\n")
## Early vs Late: 0 genes

The voom mean-variance trend plot demonstrates that the relationship between gene expression level and variance has been successfully modeled. The red trend line shows a smooth decrease in variance as the mean expression increases, which is expected in RNA-seq data and confirms that lowly expressed genes tend to exhibit higher variability. The stabilization of variance at higher expression levels indicates that the data are well-behaved and suitable for downstream linear modeling with limma. Overall, this plot validates the voom transformation and supports the reliability of the differential expression results.

2.7 Step 7: Fit linear model

fit <- lmFit(v, design)
fit <- eBayes(fit)

2.8 Step 8: Extract results

results_limma <- topTable(fit, coef = 2, number = Inf, sort.by = "none")
results_limma$gene_id <- rownames(results_limma)
final_output <- results_limma[, c("gene_id", "logFC", "adj.P.Val")]

write.csv(final_output, file = "DGE_results_limma_voom.csv", row.names = FALSE)
## 
## Differential Expression Analysis Complete. Results saved as 'DGE_results_limma_voom.csv'.

3 Part II: Metadata Analysis

3.1 Summary Statistics

## 
## ------ Metadata Analysis ------
## 
## Summary of Metadata:
##      PATHID        PMHOURS        PATHDX              SEX           
##  Min.   :4870   Min.   : 2.0   Length:47          Length:47         
##  1st Qu.:5056   1st Qu.: 6.0   Class :character   Class :character  
##  Median :5252   Median : 8.0   Mode  :character   Mode  :character  
##  Mean   :5251   Mean   : 9.5                                        
##  3rd Qu.:5422   3rd Qu.:12.0                                        
##  Max.   :5709   Max.   :39.0                                        
##                 NA's   :9                                           
##       AGE           onsetAge          APOE           BRAAK1     
##  Min.   :41.00   Min.   :35.00   Min.   :23.00   Min.   :0.000  
##  1st Qu.:67.25   1st Qu.:55.50   1st Qu.:33.00   1st Qu.:6.000  
##  Median :79.00   Median :71.00   Median :33.00   Median :6.000  
##  Mean   :74.20   Mean   :62.74   Mean   :33.18   Mean   :5.128  
##  3rd Qu.:83.00   3rd Qu.:73.00   3rd Qu.:34.00   3rd Qu.:6.000  
##  Max.   :97.00   Max.   :78.00   Max.   :34.00   Max.   :6.000  
##  NA's   :1       NA's   :8       NA's   :2                      
##   BRAAK.combo      MFTANGLES        IPTANGLES        STTANGLES    
##  Min.   :6.200   Min.   : 2.000   Min.   : 3.000   Min.   : 2.00  
##  1st Qu.:6.200   1st Qu.: 4.000   1st Qu.: 5.500   1st Qu.: 5.50  
##  Median :6.200   Median : 6.000   Median : 7.000   Median :10.00  
##  Mean   :6.218   Mean   : 7.385   Mean   : 8.179   Mean   :11.31  
##  3rd Qu.:6.200   3rd Qu.: 8.000   3rd Qu.:10.500   3rd Qu.:12.00  
##  Max.   :6.300   Max.   :24.000   Max.   :19.000   Max.   :66.00  
##  NA's   :8       NA's   :8        NA's   :8        NA's   :8      
##    HPTANGLES       MFPLAQUES       IPPLAQUES       STPLAQUES    
##  Min.   : 3.00   Min.   :24.00   Min.   :-4.00   Min.   :-4.00  
##  1st Qu.:13.50   1st Qu.:49.50   1st Qu.:46.00   1st Qu.:35.00  
##  Median :22.00   Median :50.00   Median :50.00   Median :47.00  
##  Mean   :22.59   Mean   :47.56   Mean   :44.46   Mean   :41.77  
##  3rd Qu.:27.50   3rd Qu.:50.00   3rd Qu.:50.00   3rd Qu.:50.00  
##  Max.   :66.00   Max.   :50.00   Max.   :50.00   Max.   :50.00  
##  NA's   :8       NA's   :8       NA's   :8       NA's   :8      
##    HPPLAQUES      SUM_TANGLES       EDUCATION        BLESSED         DURATION 
##  Min.   : 6.00   Min.   : 19.00   Min.   :10.00   Min.   : 4.00   Min.   : 5  
##  1st Qu.:14.00   1st Qu.: 31.00   1st Qu.:12.00   1st Qu.:11.00   1st Qu.: 8  
##  Median :18.00   Median : 39.00   Median :14.00   Median :30.00   Median :11  
##  Mean   :22.05   Mean   : 49.46   Mean   :14.36   Mean   :23.97   Mean   :10  
##  3rd Qu.:25.50   3rd Qu.: 60.50   3rd Qu.:16.00   3rd Qu.:33.00   3rd Qu.:12  
##  Max.   :50.00   Max.   :147.00   Max.   :20.00   Max.   :85.00   Max.   :15  
##  NA's   :8       NA's   :8        NA's   :11      NA's   :8       NA's   :11  
##       MMSE             DRS            Sample           MappingRate    
##  Min.   : 0.000   Min.   :  5.00   Length:47          Min.   :0.1150  
##  1st Qu.: 0.000   1st Qu.: 33.50   Class :character   1st Qu.:0.3295  
##  Median : 2.000   Median : 43.00   Mode  :character   Median :0.3750  
##  Mean   : 7.032   Mean   : 57.97                      Mean   :0.3608  
##  3rd Qu.:16.000   3rd Qu.: 89.00                      3rd Qu.:0.4005  
##  Max.   :28.000   Max.   :121.00                      Max.   :0.5020  
##  NA's   :16       NA's   :8                                           
##       RIN       ALZHEIMER_TYPE
##  Min.   :1.00   Control: 8    
##  1st Qu.:1.75   Early  :19    
##  Median :2.20   Late   :20    
##  Mean   :2.24                 
##  3rd Qu.:2.65                 
##  Max.   :3.60                 
## 

The metadata analysis reveals a cohort of 47 samples with a balanced distribution across Alzheimer’s disease stages: 8 control, 19 early-stage, and 20 late-stage cases. The average age was 74.2 years, with disease onset typically occurring around 62.7 years. Neuropathological markers such as BRAAK stages and plaque/tangle scores show expected variability, with several missing values across key measures. Cognitive scores (MMSE, DRS) and biological quality metrics (RIN, MappingRate) also vary, reflecting a heterogeneous dataset suitable for downstream stratified analyses.

3.2 Descriptive Statistics

## 
## Descriptive Statistics for AGE:
## Mean Age: 74.19565
## Median Age: 79
## Age Range: 41 - 97
## 
## Descriptive Statistics for EDUCATION:
## Mean Education: 14.36111
## Median Education: 14
## Education Range: 10 - 20
## 
## Metadata analysis complete.

Descriptive statistics indicate that participants had a mean age of 74.2 years (range: 41–97) and a median of 79, suggesting a slightly right-skewed age distribution. Education levels were relatively high, with a mean of 14.4 years and a range of 10 to 20 years, indicating a well-educated cohort. These demographic features provide important context for interpreting clinical and molecular findings in the dataset.

4 GLM and Visualization Workflow

4.1 Filtering and Model Fitting

# Filter low count genes
keep <- filterByExpr(dge, design)
dge <- dge[keep, , keep.lib.sizes=FALSE]

# Normalize with TMM
dge <- calcNormFactors(dge)

# Estimate dispersion
dge <- estimateDisp(dge, design)

# Fit the model with GLM
fit <- glmFit(dge, design)

# Perform statistical test (Likelihood Ratio Test)
lrt <- glmLRT(fit)

# View top differentially expressed genes
topTags(lrt)
## Coefficient:  Late 
##                     logFC      logCPM       LR PValue FDR
## ENSG00000180061 -20.61724 0.009724043 1760.337      0   0
## ENSG00000259964 -20.50310 0.083881772 1817.169      0   0
## ENSG00000261696 -20.45203 0.130828238 1898.934      0   0
## ENSG00000204118 -20.44666 0.181243181 3396.740      0   0
## ENSG00000230465 -20.40902 0.212726469 3583.873      0   0
## ENSG00000116194 -20.39605 0.261833078 2178.738      0   0
## ENSG00000234737 -20.38718 0.225428248 1553.236      0   0
## ENSG00000197813 -20.38375 0.108890513 2101.287      0   0
## ENSG00000287260 -20.38003 0.285945836 3863.954      0   0
## ENSG00000241345 -20.37694 0.479381725 1628.000      0   0
# View top differentially expressed genes
results <- topTags(lrt, n = Inf)$table
results$Gene <- rownames(results)

4.2 Gene Annotation with biomaRt

# Connect to Ensembl
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

# Convert Ensembl IDs
converted <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  filters = "ensembl_gene_id",
  values = results$Gene,
  mart = ensembl
)

# Merge with results
results_annot <- merge(results, converted, by.x = "Gene", by.y = "ensembl_gene_id")

# Filter significant genes (e.g., FDR < 0.05 and |log2FC| > 1)
results_annot$Significant <- ifelse(results_annot$FDR < 0.05 & abs(results_annot$logFC) > 1, "yes", "no")

# Select top genes for labeling
top_genes <- subset(results_annot, Significant == "yes")

4.3 Volcano Plot with Gene Labels

# Unir resultados corretos ao gene annotation
results_early_vs_ctrl <- topTable(fit2, coef = "Early_vs_Control", number = Inf)
results_early_vs_ctrl$Gene <- rownames(results_early_vs_ctrl)

# Anotar com biomaRt
library(biomaRt)
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
converted <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  filters = "ensembl_gene_id",
  values = results_early_vs_ctrl$Gene,
  mart = ensembl
)
results_annot <- merge(results_early_vs_ctrl, converted,
                       by.x = "Gene", by.y = "ensembl_gene_id")

# Marcar genes significativos
results_annot$Significant <- ifelse(
  results_annot$adj.P.Val < 0.05 & abs(results_annot$logFC) > 1,
  "yes", "no"
)

# Limitar valores extremos
results_annot$logFC <- pmax(pmin(results_annot$logFC, 5), -5)
results_annot$log10FDR <- -log10(pmax(results_annot$adj.P.Val, 1e-10))

# Selecionar top genes para label
top_genes <- results_annot %>%
  dplyr::filter(Significant == "yes") %>%
  dplyr::arrange(adj.P.Val) %>%
  dplyr::slice_head(n = 15)

# Volcano plot
ggplot(results_annot, aes(x = logFC, y = log10FDR, color = Significant)) +
  geom_point(alpha = 0.6) +
  scale_color_manual(values = c("no" = "grey", "yes" = "coral")) +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
  ggrepel::geom_text_repel(
    data = top_genes,
    aes(label = hgnc_symbol),
    size = 3,
    max.overlaps = 20,
    box.padding = 0.5
  ) +
  theme_minimal() +
  labs(
    title = "Volcano Plot: Early vs Control",
    x = "Log2 Fold Change",
    y = "-Log10 Adjusted P-Value"
  )

# Verifica se os genes estão no topo dos resultados
subset(results_annot, hgnc_symbol %in% c("EPHA5-AS1", "LDHAP4", "LIRIL2R" , "CAMK1G" ,"TMEM54", "FOXO1", "NELL1", "MDGA1", "OSGEP", "UBXN10", "CA11P", "SYTL2", "FOXQ1", "ABCA11P", "FLT3"))[, c("hgnc_symbol", "logFC", "adj.P.Val")]

A subset of differentially expressed genes between Early Alzheimer and Control samples revealed both upregulated and downregulated transcripts. Notably, LIRIL2R, EPHA5-AS1, and ABCA11P showed strong downregulation in Early samples (logâ‚‚FC < -1.7, adj. p < 0.05), while FOXQ1 and TMEM54 were among the most upregulated. While FOXO1 appeared visually in the volcano plot, its adjusted p-value (0.109) indicates it did not pass statistical significance thresholds, highlighting the importance of careful result interpretation.

4.4 Density and Boxplot of Normalized Counts

# Using voom-normalized data
plotDensities(v$E, main = "Density of Normalized Expressions")

boxplot(v$E, main = "Boxplot - Normalized Expression", las = 2, cex.axis = 0.7)

The density plot and boxplot of normalized gene expression values demonstrate successful normalization across samples. The density curves overlap substantially, indicating consistent global expression distributions and absence of large-scale technical bias. Similarly, the boxplot shows aligned medians and comparable interquartile ranges across samples, suggesting that expression values are well-centered and scaled. The presence of outliers is expected in RNA-seq data and reflects biological variability rather than technical artifacts. Together, these plots confirm the data is suitable for downstream differential expression analysis.

4.5 Principal Component Analysis (PCA)

# Filtering for Early  vs Control
metadata_filtered <- metadata[metadata$ALZHEIMER_TYPE %in% c("Control", "Early"), ]


# Making sure v$E are the filtered samples
v_filtered <- v$E[, rownames(metadata_filtered)]

# PCA
pca <- prcomp(t(v_filtered))
pca_df <- data.frame(pca$x[, 1:2], Group = metadata_filtered$ALZHEIMER_TYPE)

# Plot
ggplot(pca_df, aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 3) +
  theme_minimal() +
  labs(title = "PCA: Early vs Control", x = "PC1", y = "PC2")

4.6 Histograms of logFC and FDR

ggplot(results_annot, aes(x = logFC)) +
  geom_histogram(bins = 60, fill = "skyblue", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Log2 Fold Change", x = "logFC")

The distribution of log2 fold changes between Early Alzheimer and Control samples follows a near-normal pattern centered around zero, indicating that the majority of genes exhibit no substantial expression change between groups. This symmetry and concentration around zero are expected in transcriptome-wide analyses, with only a minority of genes showing biologically meaningful up- or downregulation. The tails of the distribution highlight differentially expressed genes and are consistent with the genes identified in the volcano plot.

5 Outlier and Batch Effect Analysis

We assessed potential outliers and batch effects using PCA and sample metadata variables (e.g., RIN, PMHOURS, MappingRate). This is essential to ensure that technical variation does not confound biological interpretation.

5.1 PCA Visualization by Technical Variables

# Match expression data
v_filtered <- v$E[, rownames(metadata_filtered)]

# Perform PCA
pca <- prcomp(t(v_filtered), scale. = TRUE) 
pca_df <- data.frame(pca$x[, 1:2],
                     RIN = metadata_filtered$RIN, PMHOURS = metadata_filtered$PMHOURS,
                     MappingRate = metadata_filtered$MappingRate)

# PCA colored by RIN

ggplot(pca_df, aes(x = PC1, y = PC2, color = RIN)) + geom_point(size = 3) + scale_color_gradient(low = "blue", high = "red") + theme_minimal() + labs(title = "PCA Colored by RIN", x = "PC1", y = "PC2")

# PCA colored by PMHOURS

ggplot(pca_df, aes(x = PC1, y = PC2, color = PMHOURS)) + geom_point(size = 3) + scale_color_gradient(low = "blue", high = "red") + theme_minimal() + labs(title = "PCA Colored by PMHOURS", x = "PC1", y = "PC2")

# PCA colored by Mapping Rate

ggplot(pca_df, aes(x = PC1, y = PC2, color = MappingRate)) + geom_point(size = 3) + scale_color_gradient(low = "blue", high = "red") + theme_minimal() + labs(title = "PCA Colored by Mapping Rate", x = "PC1", y = "PC2")

5.2 Outlier Detection Based on PCA Distance

# Distance from origin in PCA space
dists <- sqrt(rowSums(pca$x[, 1:2]^2))
threshold <- mean(dists) + 3 * sd(dists)
metadata_filtered$Outlier <- dists > threshold

# Visualize outliers
pca_df$Outlier <- metadata_filtered$Outlier

ggplot(pca_df, aes(x = PC1, y = PC2, color = Outlier)) +
  geom_point(size = 3) +
  scale_color_manual(values = c("FALSE" = "black", "TRUE" = "red")) +
  theme_minimal() +
  labs(title = "PCA: Outlier Detection", x = "PC1", y = "PC2")

5.3 Batch Effect Correction

# Remove effect of RIN, if it's a confounder
library(limma)
v_corrected <- removeBatchEffect(v_filtered, covariates = metadata_filtered$RIN)

# PCA after correction
pca_corr <- prcomp(t(v_corrected), scale. = TRUE)
pca_corr_df <- data.frame(pca_corr$x[, 1:2], Group = metadata_filtered$ALZHEIMER_TYPE)

ggplot(pca_corr_df, aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 3) +
  theme_minimal() +
  labs(title = "PCA After Batch Effect Correction", x = "PC1", y = "PC2")

Batch effect correction was performed using the limma::removeBatchEffect() function, accounting for RIN and MappingRate as covariates. PCA plots after correction show reduced technical bias, with improved separation of biological groups (Early vs Control), suggesting that the correction was effective.

5.4 Interactive Table of Results

datatable(results_annot, options = list(pageLength = 20),
          caption = 'Differential Expression Analysis Results')